library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the raw New York listings CSV; readr infers the column types.
ny_house_data <- read_csv(
  file = "/Users/cyn_chen/Desktop/Group_C_NYCHouse/Data/NY-House-Dataset.csv"
)
## Rows: 4801 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): BROKERTITLE, TYPE, ADDRESS, STATE, MAIN_ADDRESS, ADMINISTRATIVE_AR...
## dbl  (6): PRICE, BEDS, BATH, PROPERTYSQFT, LATITUDE, LONGITUDE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Discard every listing that has a missing value in any column, so the
# quartile and median computations further down see complete rows only.
ny_house_data <- na.omit(ny_house_data)

# Append lower-case numeric copies of the core listing measurements.
# The upper-case source columns are left in place.
ny_house_data <- ny_house_data %>%
  mutate(
    price = as.numeric(PRICE),
    beds = as.numeric(BEDS),
    bath = as.numeric(BATH),
    propertysqft = as.numeric(PROPERTYSQFT)
  )

# Outlier removal on PRICE using the 1.5 * IQR rule.
# Q1 and Q3 are the 25th and 75th percentiles of PRICE; IQR is their spread.
# NOTE: the variable `IQR` shares its name with the function stats::IQR().
Q1 <- quantile(ny_house_data[["PRICE"]], probs = 0.25)
Q3 <- quantile(ny_house_data[["PRICE"]], probs = 0.75)
IQR <- Q3 - Q1

# The fences sit 1.5 * IQR below Q1 and above Q3.
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Keep only listings whose PRICE lies inside [lower_bound, upper_bound];
# both ends are inclusive.
ny_house_data <- filter(
  ny_house_data,
  PRICE >= lower_bound,
  PRICE <= upper_bound
)
# Derive two extra columns on the listings table:
#   ZIP_CODE       - the five digits that end the STATE string (NA when the
#                    string does not end in five digits)
#   PRICE_PER_SQFT - PRICE divided by PROPERTYSQFT
library(stringr)
ny_house_data <- ny_house_data %>%
  mutate(
    ZIP_CODE = str_extract(STATE, "\\d{5}$"),
    PRICE_PER_SQFT = PRICE / PROPERTYSQFT
  )

# One row per ZIP code holding the median of PRICE / PROPERTYSQFT across its
# listings. The result column is called `median_price`, but the value is a
# median price per square foot.
# ZIP_CODE is then cast from character to integer for the join against the
# MODZCTA table.
median_prices_per_sqft <- ny_house_data %>%
  group_by(ZIP_CODE) %>%
  summarise(median_price = median(PRICE / PROPERTYSQFT, na.rm = TRUE)) %>%
  mutate(ZIP_CODE = as.integer(ZIP_CODE))
# Load the Modified ZIP Code Tabulation Areas (MODZCTA) table for New York
# City and attach the per-ZIP median price to each area, matching MODZCTA
# against ZIP_CODE. A left join keeps every area, priced or not.
ny_zipcode_shape <- read.csv(
  file = "/Users/cyn_chen/Desktop/Group_C_NYCHouse/Data/Modified_Zip_Code_Tabulation_Areas__MODZCTA_.csv"
)
zip_codes_with_prices <- ny_zipcode_shape %>%
  left_join(median_prices_per_sqft, by = c("MODZCTA" = "ZIP_CODE"))

# Keep fully populated rows only. This runs before the column drop below, so
# a missing value in any column (including the ones about to be removed)
# discards the row; areas without a matched median price go here too.
zip_codes_with_prices <-
  zip_codes_with_prices[complete.cases(zip_codes_with_prices), , drop = FALSE]

# Remove the columns the map does not need; names that are absent are ignored.
columns_to_drop <- c("label", "ZCTA","pop_est")
zip_codes_with_prices <-
  zip_codes_with_prices[setdiff(names(zip_codes_with_prices), columns_to_drop)]

# Store the area code as a double.
zip_codes_with_prices[["MODZCTA"]] <-
  as.numeric(zip_codes_with_prices[["MODZCTA"]])
# Slim base data frame with just the fields used by the listing point map.
# PRICE is exposed under the name TOTAL_PRICE; all other names are unchanged.
ny_house_data_map <- with(
  ny_house_data,
  data.frame(
    FORMATTED_ADDRESS = FORMATTED_ADDRESS,
    TYPE = TYPE,
    TOTAL_PRICE = PRICE,
    PRICE_PER_SQFT = PRICE_PER_SQFT,
    LONGITUDE = LONGITUDE,
    LATITUDE = LATITUDE
  )
)
library(leaflet)
# Clustered point map of the individual listings.
#
# Every formula (`~COLUMN`) below is resolved against the data frame bound in
# leaflet(), i.e. ny_house_data_map. Coordinates and popup text therefore come
# from the same rows; reading popup fields from a second global data frame
# would silently misalign the text if the two tables ever differed in row
# count or order.
#
# NOTE(review): FORMATTED_ADDRESS and TYPE are pasted into the popup HTML
# without escaping - confirm the source file is trusted.
p1 <- leaflet(ny_house_data_map) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~LONGITUDE,
    lat = ~LATITUDE,
    popup = ~paste(
      "<b>Address:</b>", FORMATTED_ADDRESS, "<br>",
      "<b>Establishment Type:</b> ", TYPE, "<br>",
      "<b>Price per sqft:</b> ", PRICE_PER_SQFT
    ),
    # Nearby markers collapse into clusters that expand on zoom/click.
    clusterOptions = markerClusterOptions()
  )
p1
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(viridis)
## Loading required package: viridisLite
# Parse the WKT text stored in the `the_geom` column so the table becomes an
# sf object with a geometry column.
zip_codes_with_prices <- zip_codes_with_prices %>%
  st_as_sf(wkt = "the_geom")

# Continuous viridis colour scale spanning the observed range of
# `median_price` (the per-ZIP median price per square foot).
pal <- colorNumeric(
  palette = "viridis",
  domain = zip_codes_with_prices[["median_price"]]
)

# Choropleth of median price per square foot by ZIP code area.
# No setView()/fitBounds() call is made here, so the initial view is left to
# leaflet's default handling of the added layers.
leaflet_map <- leaflet(data = zip_codes_with_prices) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(median_price),  # Fill colour encodes median_price
    fillOpacity = 0.7,
    color = "black",  # Border color
    weight = 1,  # Border thickness
    popup = ~paste("Zip Code:", MODZCTA, "<br>Median Price:", median_price),
    # Hover highlighting must be described with highlightOptions().
    # addPolygons() has no `highlight` argument: `highlight = TRUE` only
    # partial-matches `highlightOptions` and hands it a bare logical, which
    # configures no highlight style.
    highlightOptions = highlightOptions(
      weight = 3,  # Thicker border while hovered
      fillOpacity = 0.9,  # Slightly more opaque fill while hovered
      bringToFront = TRUE  # Keep the hovered border above its neighbours
    )
  ) %>%
  addLegend(
    pal = pal,
    values = zip_codes_with_prices$median_price,
    title = "Median Price Per Sqft",
    position = "bottomright"  # Legend position
  )

# Display the map
leaflet_map